import os
import sys
import pysam

# Exactly two positional arguments are required: the dataset name (selects the
# input directory and the pairing mode) and the library name (BAM file stem).
# Fail with a usage message instead of an opaque unpacking error.
if len(sys.argv) != 3:
    sys.exit("Usage: %s <dataset> <library>" % sys.argv[0])
dataset, library = sys.argv[1:]

def generate_blocks(alignments):
    """Group consecutive alignments that share a query name.

    Args:
        alignments: Iterable of objects exposing a ``query_name`` attribute.
            Records belonging to the same query must be adjacent.

    Yields:
        Non-empty lists of consecutive alignments with equal ``query_name``,
        in input order. Nothing is yielded for empty input.
    """
    block = []
    for alignment in alignments:
        # Compare against the block's own first record rather than a ""
        # sentinel, so no empty leading block is emitted and a genuinely
        # empty query name is grouped correctly.
        if block and alignment.query_name != block[0].query_name:
            yield block
            block = []
        block.append(alignment)
    if block:
        yield block

def generate_blocks_miseq(alignments):
    """Group consecutive record pairs that share a query name.

    The input is consumed two records at a time; each pair must carry the
    same ``query_name``. Consecutive pairs with the same query name are
    collected into one block.

    Args:
        alignments: Iterable of objects exposing a ``query_name`` attribute,
            laid out as adjacent pairs.

    Yields:
        Non-empty lists of ``(alignment1, alignment2)`` tuples, one list per
        run of equal query names. Nothing is yielded for empty input.

    Raises:
        ValueError: If the input holds an odd number of records, or the two
            records of a pair have different query names.
    """
    # next() below needs a real iterator; iter() returns an iterator unchanged
    # and makes plain sequences work too.
    records = iter(alignments)
    block = []
    for alignment1 in records:
        try:
            alignment2 = next(records)
        except StopIteration:
            # Without this, the StopIteration would surface from the
            # generator as an obscure RuntimeError.
            raise ValueError(
                "odd number of records: %r has no second record"
                % (alignment1.query_name,)
            ) from None
        if alignment1.query_name != alignment2.query_name:
            raise ValueError(
                "records are not paired: %r followed by %r"
                % (alignment1.query_name, alignment2.query_name)
            )
        # Compare against the block's own first pair rather than a ""
        # sentinel, so no empty leading block is emitted.
        if block and alignment1.query_name != block[0][0].query_name:
            yield block
            block = []
        block.append((alignment1, alignment2))
    if block:
        yield block

# Read <dataset>/Mapping/<library>.bam and write a copy, under the same file
# name in the current working directory, in which every mapped record carries
# an NH tag (number of records in its query-name block) and an HI tag
# (0-based index of the record within that block).
directory = "/osc-fs_home/mdehoon/Data/CASPARs/%s/Mapping/" % dataset
filename = "%s.bam" % library
path = os.path.join(directory, filename)
# The output reuses the input's file name; opening it for writing would
# truncate the input if the script were run from the Mapping directory.
if os.path.realpath(path) == os.path.realpath(filename):
    sys.exit("Refusing to overwrite the input file %s" % path)
print("Reading", path)
# Context managers close both files even when a consistency check fails.
with pysam.Samfile(path) as alignments:
    print("Writing", filename)
    with pysam.Samfile(filename, "wb", template=alignments) as output:
        if dataset == "MiSeq":
            # Records are handled as pairs sharing one query name.
            for block in generate_blocks_miseq(alignments):
                count = len(block)
                for i, (alignment1, alignment2) in enumerate(block):
                    name = alignment1.query_name
                    if alignment1.is_unmapped:
                        if not alignment2.is_unmapped:
                            raise ValueError(
                                "%s: first record unmapped but second mapped"
                                % name
                            )
                        if count != 1:
                            raise ValueError(
                                "%s: unmapped pair in a block of %d pairs"
                                % (name, count)
                            )
                    else:
                        if alignment2.is_unmapped:
                            raise ValueError(
                                "%s: first record mapped but second unmapped"
                                % name
                            )
                        # NOTE(review): only the first record of the pair is
                        # tagged; confirm whether the second record should
                        # receive NH/HI as well.
                        alignment1.set_tag("NH", count)
                        alignment1.set_tag("HI", i)
                    output.write(alignment1)
                    output.write(alignment2)
        else:
            for block in generate_blocks(alignments):
                count = len(block)
                for i, alignment in enumerate(block):
                    if alignment.is_unmapped:
                        # An unmapped record must be the only one for its query.
                        if count != 1:
                            raise ValueError(
                                "%s: unmapped record in a block of %d records"
                                % (alignment.query_name, count)
                            )
                    else:
                        alignment.set_tag("NH", count)
                        alignment.set_tag("HI", i)
                    output.write(alignment)
